writeLines("td, th { padding : 3px } th { background-color:white; color:black; border:1px solid black; text-align:center } td {color:black; border:1px solid black; word-wrap:break-word; white-space:nowrap; overflow: hidden; text-overflow: ellipsis; max-width:300px; text-align:left}", con= "../css/style.css")# Plot Boxplots faceted by shared metabolites
################################################################################
p <- plot_df %>%
ggplot(aes(y = METABOLITE_NAME, x = VALUE, color = REPLICATE)) +
geom_boxplot(alpha = 0.8) +
labs(title=paste0(tissue,' ',study_institute_metab_family,": Original Metabolite Abundances"),
x = "Abundance", y = "")
plot(p)# Identify Zero/Missing Values in Metabolites
################################################################################
na_df <- plot_df %>%
ungroup() %>%
group_by(METABOLITE_NAME, REPLICATE) %>%
mutate(ZERO_LOG = ifelse(VALUE == 0, 1, 0)) %>%
mutate(ZERO_N = sum(ZERO_LOG)) %>%
mutate(ZERO_FREQ = round(ZERO_N/length(shared_sams), digits = 2)) %>%
mutate(NA_LOG = ifelse(is.na(VALUE), 1, 0)) %>%
mutate(NA_N = sum(NA_LOG)) %>%
mutate(NA_FREQ = round(NA_N/length(shared_sams), digits = 2)) %>%
select(REPLICATE, METABOLITE_NAME, ZERO_N, ZERO_FREQ, NA_N, NA_FREQ) %>%
unique() %>%
arrange(REPLICATE, desc(ZERO_FREQ), desc(NA_FREQ)) %>%
ungroup()
na_df %>%
knitr::kable(format = "html") %>%
scroll_box(width = "100%", height = "400px")| REPLICATE | METABOLITE_NAME | ZERO_N | ZERO_FREQ | NA_N | NA_FREQ |
|---|---|---|---|---|---|
| 512 | Aminoisobutyric acid | 78 | 1.00 | 0 | 0 |
| 512 | Homocysteine | 72 | 0.92 | 0 | 0 |
| 512 | 3-Methylhistidine | 50 | 0.64 | 0 | 0 |
| 512 | Aminoadipic acid | 17 | 0.22 | 0 | 0 |
| 512 | Cystathionine | 6 | 0.08 | 0 | 0 |
| 512 | Cysteine | 5 | 0.06 | 0 | 0 |
| 512 | 1-Methylhistidine | 4 | 0.05 | 0 | 0 |
| 512 | Alanine | 0 | 0.00 | 0 | 0 |
| 512 | Anserine | 0 | 0.00 | 0 | 0 |
| 512 | Arginine | 0 | 0.00 | 0 | 0 |
| 512 | Asparagine | 0 | 0.00 | 0 | 0 |
| 512 | Aspartic acid | 0 | 0.00 | 0 | 0 |
| 512 | Carnosine | 0 | 0.00 | 0 | 0 |
| 512 | Citrulline | 0 | 0.00 | 0 | 0 |
| 512 | Ethanolamine | 0 | 0.00 | 0 | 0 |
| 512 | Glutamic acid | 0 | 0.00 | 0 | 0 |
| 512 | Glutamine | 0 | 0.00 | 0 | 0 |
| 512 | Glycine | 0 | 0.00 | 0 | 0 |
| 512 | Histidine | 0 | 0.00 | 0 | 0 |
| 512 | Hydroxyproline | 0 | 0.00 | 0 | 0 |
| 512 | Isoleucine | 0 | 0.00 | 0 | 0 |
| 512 | Leucine | 0 | 0.00 | 0 | 0 |
| 512 | Lysine | 0 | 0.00 | 0 | 0 |
| 512 | Methionine | 0 | 0.00 | 0 | 0 |
| 512 | Ornithine | 0 | 0.00 | 0 | 0 |
| 512 | Phenylalanine | 0 | 0.00 | 0 | 0 |
| 512 | Phosphoethanolamine | 0 | 0.00 | 0 | 0 |
| 512 | Proline | 0 | 0.00 | 0 | 0 |
| 512 | Sarcosine | 0 | 0.00 | 0 | 0 |
| 512 | Serine | 0 | 0.00 | 0 | 0 |
| 512 | Taurine | 0 | 0.00 | 0 | 0 |
| 512 | Threonine | 0 | 0.00 | 0 | 0 |
| 512 | Tryptophan | 0 | 0.00 | 0 | 0 |
| 512 | Tyrosine | 0 | 0.00 | 0 | 0 |
| 512 | Valine | 0 | 0.00 | 0 | 0 |
| 512 | 2-Aminobutyric acid | 0 | 0.00 | 0 | 0 |
| 512 | beta-Alanine | 0 | 0.00 | 0 | 0 |
| 512 | gamma-Aminobutyric acid | 0 | 0.00 | 0 | 0 |
| 513 | Aminoisobutyric acid | 78 | 1.00 | 0 | 0 |
| 513 | Homocysteine | 62 | 0.79 | 0 | 0 |
| 513 | 3-Methylhistidine | 41 | 0.53 | 0 | 0 |
| 513 | Aminoadipic acid | 11 | 0.14 | 0 | 0 |
| 513 | Cystathionine | 6 | 0.08 | 0 | 0 |
| 513 | Cysteine | 5 | 0.06 | 0 | 0 |
| 513 | 1-Methylhistidine | 1 | 0.01 | 0 | 0 |
| 513 | Alanine | 0 | 0.00 | 0 | 0 |
| 513 | Anserine | 0 | 0.00 | 0 | 0 |
| 513 | Arginine | 0 | 0.00 | 0 | 0 |
| 513 | Asparagine | 0 | 0.00 | 0 | 0 |
| 513 | Aspartic acid | 0 | 0.00 | 0 | 0 |
| 513 | Carnosine | 0 | 0.00 | 0 | 0 |
| 513 | Citrulline | 0 | 0.00 | 0 | 0 |
| 513 | Ethanolamine | 0 | 0.00 | 0 | 0 |
| 513 | Glutamic acid | 0 | 0.00 | 0 | 0 |
| 513 | Glutamine | 0 | 0.00 | 0 | 0 |
| 513 | Glycine | 0 | 0.00 | 0 | 0 |
| 513 | Histidine | 0 | 0.00 | 0 | 0 |
| 513 | Hydroxyproline | 0 | 0.00 | 0 | 0 |
| 513 | Isoleucine | 0 | 0.00 | 0 | 0 |
| 513 | Leucine | 0 | 0.00 | 0 | 0 |
| 513 | Lysine | 0 | 0.00 | 0 | 0 |
| 513 | Methionine | 0 | 0.00 | 0 | 0 |
| 513 | Ornithine | 0 | 0.00 | 0 | 0 |
| 513 | Phenylalanine | 0 | 0.00 | 0 | 0 |
| 513 | Phosphoethanolamine | 0 | 0.00 | 0 | 0 |
| 513 | Proline | 0 | 0.00 | 0 | 0 |
| 513 | Sarcosine | 0 | 0.00 | 0 | 0 |
| 513 | Serine | 0 | 0.00 | 0 | 0 |
| 513 | Taurine | 0 | 0.00 | 0 | 0 |
| 513 | Threonine | 0 | 0.00 | 0 | 0 |
| 513 | Tryptophan | 0 | 0.00 | 0 | 0 |
| 513 | Tyrosine | 0 | 0.00 | 0 | 0 |
| 513 | Valine | 0 | 0.00 | 0 | 0 |
| 513 | 2-Aminobutyric acid | 0 | 0.00 | 0 | 0 |
| 513 | beta-Alanine | 0 | 0.00 | 0 | 0 |
| 513 | gamma-Aminobutyric acid | 0 | 0.00 | 0 | 0 |
# Remove Metabolites with high NAor Zero Frequencies
################################################################################
met_rm <- na_df %>%
filter((ZERO_FREQ >= 0.95) | (NA_FREQ >= 0.8)) %>%
select(METABOLITE_NAME) %>% unlist() %>% unique()
metabolites <- metabolites[metabolites %!in% met_rm]
plot_df <- plot_df %>%
filter(METABOLITE_NAME %in% metabolites)
plot_df$METABOLITE_NAME <- as.character(plot_df$METABOLITE_NAME)# Summary Statistics of Density plot distribution
################################################################################
# Iterate through the runs and collect vectors
df_all <- data.frame()
for(rep in c(reps)){
# Collect numeric vector
num_vec <- plot_df %>%
filter(REPLICATE == rep) %>%
select(VALUE) %>% unlist() %>% unname()
df <- NumericSummaryStats(num_vec)
row.names(df) <- rep
df_all <- rbind(df_all, df)
}
df_all %>%
knitr::kable(format = "html") %>%
scroll_box(width = "100%", height = "200px")| NA_COUNT | NA_FREQ | MEAN | MEDIAN | MAX | MIN | MID_RANGE | VARIANCE | STD_DEV | SE_MEAN | Q1 | Q3 | KURTOSIS | SKEW | BC_LAMBDA | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 513 | 0 | 0 | 4.22 | 0.61 | 72.88 | 0 | 72.88 | 90.18 | 9.50 | 0.18 | 0.13 | 2.06 | 11.42 | 3.25 | None |
| 512 | 0 | 0 | 3.74 | 0.53 | 80.49 | 0 | 80.49 | 72.40 | 8.51 | 0.16 | 0.12 | 1.84 | 14.26 | 3.47 | None |
# Plot the density plot for all the gene counts
################################################################################
plot_df %>%
ggplot(aes(x = VALUE, color = REPLICATE)) +
geom_density() +
xlim(min(df_all$MIN), mean(df_all$Q3)) +
labs(title=paste0(tissue,' ',study_institute_metab_family,": Original Metabolite Abundances"),
x = "Abundance",
y = "Density")################################################################################
####################### Metabolite by Metabolite Correlations ##################
################################################################################
# Subset Matrices
x <- plot_df %>%
unite(METABOLITE_NAME,REPLICATE,
col = METABOLITE_NAME_REPLICATE, remove = F) %>%
select(METABOLITE_NAME_REPLICATE, labelid, VALUE,REPLICATE) %>%
split(.$REPLICATE) %>%
map(select, -REPLICATE) %>%
map(pivot_wider, names_from = METABOLITE_NAME_REPLICATE, values_from = VALUE) %>%
map(arrange, labelid) %>%
map(tibble::column_to_rownames, var = "labelid") %>%
map(as.matrix)
# Subset matrices
if(all(row.names(x[[1]]) == row.names(x[[2]]))){
shared_met_mat <- do.call(cbind,x)
}
Corr <- 'Spearman'
# Create an NxN Matrix of correlation
if(Corr == 'Pearson'){
cor1 <- stats::cor(shared_met_mat, method = 'pearson') %>% round(digits = 3)
}else if(Corr == 'Spearman'){
cor1 <- stats::cor(shared_met_mat, method = 'spearman') %>% round(digits = 3)
}
# Reorder the correlation matrix
cormat <- cor1
# Melt the correlation matrix
melted_cormat <- melt(cormat, na.rm = TRUE)
################################################################################
####################### Only Reciprocal Pairs ##################################
################################################################################
# Remove rows that compare metabolites from the same dataset, them remove redundent measurements
melted_cormat2 <- melted_cormat
# Widen the datafrmae back to matrix
cormat2 <- melted_cormat2 %>%
pivot_wider(names_from = Var2, values_from = value) %>%
as.data.frame()
row.names(cormat2) <- cormat2$Var1
cormat2 <- cormat2 %>%
select(-Var1) %>%
as.matrix()
cormat2 <- cormat2[sort(rownames(cormat2)),sort(colnames(cormat2))]
cormat2 <- cormat2[grepl(reps[1], rownames(cormat2)),grepl(reps[2], colnames(cormat2))]
# Orient the colors and breaks
paletteLength <- 1000
myColor <- colorRampPalette(c("blue", "white", "red"))(paletteLength)
# length(breaks) == length(paletteLength) + 1
# use floor and ceiling to deal with even/odd length pallettelengths
myBreaks <- c(seq(-1, 0, length.out=ceiling(paletteLength/2) + 1),
seq(1/paletteLength, max(cor1, na.rm = T), length.out=floor(paletteLength/2)))
# Plot the heatmap
pheatmap(cormat2, color=myColor, breaks=myBreaks,
cluster_rows = F, cluster_cols = F,fontsize = 6)# Subset the data for sites to compare (scatterplots)
###################################################
plot_join_df <- countdata_df %>%
ungroup() %>%
unite(STUDY_INSTITUTE,METAB_FAMILY, col = "STUDY_INSTITUTE_METAB_FAMILY") %>%
filter(TISSUE == tissue) %>%
filter(NAMED == named) %>%
filter(STUDY_INSTITUTE_METAB_FAMILY == study_institute_metab_family) %>%
unnest(COUNT_DATA) %>%
filter(viallabel %in% as.character(unlist(sample_join$sample_id))) %>%
filter(METABOLITE_NAME %in% metabolites) %>%
unite(labelid, viallabel, col = "labelid_viallabel", remove = F) %>%
mutate(REPLICATE = case_when(grepl(paste0(reps[1],"$"),labelid_viallabel) ~ reps[1],
grepl(paste0(reps[2],"$"),labelid_viallabel) ~ reps[2],
)) %>%
select(REPLICATE,METABOLITE_NAME,labelid,VALUE) %>%
mutate(REPLICATE = as.character(REPLICATE)) %>%
split(.$REPLICATE) %>%
map(~rename(., !!sym(unique(.$REPLICATE)) := "VALUE")) %>%
map(~select(., -REPLICATE)) %>%
purrr::reduce(left_join, by = c("labelid","METABOLITE_NAME"))
# Plot scatter plots
################################################################################
# Plot scatter plots ordered by sample (include R2 values)
lm_df <- plot_join_df %>%
rename(name = METABOLITE_NAME) %>%
rename(x = reps[1]) %>%
rename(y = reps[2])
# Iterate through each of the shared metabolites to collect summary info about their correlations
r2_df <- data.frame()
for(metab in metabolites){
met_df <- lm_df %>%
filter(name == metab)
r2_df <- rbind(r2_df, lm_eqn(met_df))
}
# Display summaries
met_r2_df <- r2_df %>%
arrange(METABOLITE) %>%
select(METABOLITE,R2) %>%
arrange(desc(R2))
met_r2_df %>%
knitr::kable(format = "html") %>%
scroll_box(width = "100%", height = "200px")| METABOLITE | R2 |
|---|---|
| 2-Aminobutyric acid | 0.883 |
| Alanine | 0.53 |
| Lysine | 0.496 |
| Sarcosine | 0.494 |
| Aspartic acid | 0.491 |
| beta-Alanine | 0.462 |
| Proline | 0.423 |
| Isoleucine | 0.406 |
| Threonine | 0.392 |
| Tyrosine | 0.377 |
| Serine | 0.347 |
| Valine | 0.336 |
| Glutamic acid | 0.299 |
| Glycine | 0.274 |
| Leucine | 0.252 |
| Tryptophan | 0.226 |
| Ethanolamine | 0.224 |
| Citrulline | 0.22 |
| Ornithine | 0.22 |
| Glutamine | 0.203 |
| Methionine | 0.188 |
| Asparagine | 0.169 |
| Arginine | 0.158 |
| Hydroxyproline | 0.157 |
| Aminoadipic acid | 0.116 |
| Carnosine | 0.0951 |
| Phenylalanine | 0.0491 |
| Cysteine | 0.0437 |
| 1-Methylhistidine | 0.0335 |
| gamma-Aminobutyric acid | 0.0332 |
| Taurine | 0.0329 |
| Cystathionine | 0.0281 |
| Histidine | 0.0105 |
| Anserine | 0.00496 |
| Homocysteine | 0.00439 |
| Phosphoethanolamine | 0.0038 |
| 3-Methylhistidine | 0.00311 |
# Collect the order of metabolites
scat_met_order <- met_r2_df %>% select(METABOLITE) %>% unlist() %>% as.character()
# Plot the scatter plots
for(metab in scat_met_order){
p <- plot_join_df %>%
filter(METABOLITE_NAME == metab) %>%
ggplot(aes(x = !!sym(reps[1]), y = !!sym(reps[2])), color = ) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", size = 1) +
geom_abline(linetype="dashed") +
facet_wrap(~ METABOLITE_NAME) +
expand_limits(x = 0, y = 0) +
#labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances")) +
coord_fixed(ratio=1)
plot(p)
}################################################################################
####################### Sample by Sample Correlations ##########################
################################################################################
# Subset Data
################################################################################
# Original NxP Matrix
x <- plot_df %>%
unite(labelid,REPLICATE,
col = labelid_REPLICATE, remove = F) %>%
select(labelid_REPLICATE, METABOLITE_NAME, VALUE,REPLICATE) %>%
split(.$REPLICATE) %>%
map(select, -REPLICATE) %>%
map(pivot_wider, names_from = METABOLITE_NAME, values_from = VALUE) %>%
map(tibble::column_to_rownames, var = "labelid_REPLICATE") %>%
map(as.matrix)
# Subset matrices
shared_sam_mat <- do.call(rbind,x) %>%
t()
# Create an NxN Matrix of correlation
Corr <- 'Spearman'
if(Corr == 'Pearson'){
cor1 <- stats::cor(shared_sam_mat, method = 'pearson') %>% round(digits = 3)
}else if(Corr == 'Spearman'){
cor1 <- stats::cor(shared_sam_mat, method = 'spearman') %>% round(digits = 3)
}
# Melt the correlation matrix
melted_cormat <- melt(cor1, na.rm = TRUE)
################################################################################
####################### Unclustered ############################################
################################################################################
# Remove rows that compare metabolites from the same dataset, them remove redundent measurements
melted_cormat2 <- melted_cormat
# Widen the dataframwe back to matrix
cormat2 <- melted_cormat2 %>%
pivot_wider(names_from = Var2, values_from = value) %>%
as.data.frame()
row.names(cormat2) <- cormat2$Var1
cormat2 <- cormat2 %>%
select(-Var1) %>%
as.matrix()
cormat2 <- cormat2[sort(rownames(cormat2)),sort(colnames(cormat2))]
cormat2 <- cormat2[grepl(reps[1], rownames(cormat2)),grepl(reps[2], colnames(cormat2))]
# Orient the colors and breaks
paletteLength <- 1000
myColor <- colorRampPalette(c("blue", "white", "red"))(paletteLength)
# length(breaks) == length(paletteLength) + 1
# use floor and ceiling to deal with even/odd length pallettelengths
myBreaks <- c(seq(-1, 0, length.out=ceiling(paletteLength/2) + 1),
seq(1/paletteLength, max(cor1, na.rm = TRUE), length.out=floor(paletteLength/2)))
# Plot the heatmap
pheatmap(as.matrix(cormat2), color=myColor, breaks=myBreaks,
cluster_rows = F, cluster_cols = F,fontsize = 6)pca <- prcomp(na.omit(t(shared_sam_mat)), scale. = F)
percentVar <- pca$sdev^2/sum(pca$sdev^2)
PC <- pca$x %>% as.data.frame()
PC$labelid_rep <- as.character(row.names(pca$x))
PC <- PC %>%
tidyr::separate(col = labelid_rep, into = c('labelid', 'REPLICATE'),
sep = "_", remove = T)
# Join the phenotype data
PC <- left_join(PC, pheno_df, by = 'labelid')
p0 <- PC %>%
ggplot(aes(x = PC1, y = PC2, color=REPLICATE)) +
geom_point(size = 3) +
geom_line(aes(group = labelid),color="dark grey") +
ggtitle('Shared Samples and Shared Metabolites') +
xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
theme(aspect.ratio=1)
p0p1 <- PC %>%
ggplot(aes(x = PC1, y = PC2, color=Key.anirandgroup, shape = REPLICATE)) +
geom_point(size = 3) +
geom_line(aes(group = labelid),color="dark grey") +
ggtitle('Shared Samples and Shared Metabolites') +
xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
scale_color_manual(values=ec_colors) +
theme(aspect.ratio=1)
p1PC$Registration.sex <- as.character(PC$Registration.sex)
p2 <- PC %>%
ggplot(aes(x = PC1, y = PC2, color=Registration.sex, shape = REPLICATE)) +
geom_point(size = 3) +
geom_line(aes(group = labelid),color="dark grey") +
ggtitle('Shared Samples and Shared Metabolites') +
xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
theme(aspect.ratio=1)
p2# Normalize
# Note in this step, we use labelid to account for mayo's duplicate samples
################################################################################
norm_df <- plot_df %>%
unite(labelid, viallabel, col = "labelid_viallabel", remove = F) %>%
split(.$REPLICATE) %>%
map(select, labelid_viallabel, METABOLITE_NAME, VALUE) %>%
map(pivot_wider, names_from = METABOLITE_NAME, values_from = VALUE) %>%
map(column_to_rownames, var = "labelid_viallabel") %>%
map(as.matrix) %>%
map(AutoScaleMatrix) %>%
map(as.data.frame) %>%
map(rownames_to_column, var = "labelid_viallabel") %>%
map(pivot_longer, names_to = 'METABOLITE_NAME', values_to = 'VALUE', cols = all_of(metabolites)) %>%
map(mutate, REPLICATE = case_when(grepl(paste0(reps[1],"$"),labelid_viallabel) ~ reps[1],
grepl(paste0(reps[2],"$"),labelid_viallabel) ~ reps[2],
grepl(paste0(reps[3],"$"),labelid_viallabel) ~ reps[3])) %>%
bind_rows()# Plot Boxplots faceted by shared metabolites
################################################################################
norm_df %>%
ggplot(aes(y = METABOLITE_NAME, x = VALUE, color = REPLICATE)) +
geom_boxplot(alpha = 0.8) +
labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances"),
x = "Abundance", y = "") ################################################################################
####################### Metabolite by Metabolite Correlations ##################
################################################################################
# Subset Matrices
x <- plot_df %>%
unite(METABOLITE_NAME,REPLICATE,
col = METABOLITE_NAME_REPLICATE, remove = F) %>%
select(METABOLITE_NAME_REPLICATE, labelid, VALUE,REPLICATE) %>%
split(.$REPLICATE) %>%
map(select, -REPLICATE) %>%
map(pivot_wider, names_from = METABOLITE_NAME_REPLICATE, values_from = VALUE) %>%
map(arrange, labelid) %>%
map(tibble::column_to_rownames, var = "labelid") %>%
map(as.matrix) %>%
map(AutoScaleMatrix)
# Subset matrices
if(all(row.names(x[[1]]) == row.names(x[[2]]))){
shared_met_mat <- do.call(cbind,x)
}
Corr <- 'Spearman'
# Create an NxN Matrix of correlation
if(Corr == 'Pearson'){
cor1 <- stats::cor(shared_met_mat, method = 'pearson') %>% round(digits = 3)
}else if(Corr == 'Spearman'){
cor1 <- stats::cor(shared_met_mat, method = 'spearman') %>% round(digits = 3)
}
# Reorder the correlation matrix
cormat <- cor1
# Melt the correlation matrix
melted_cormat <- melt(cormat, na.rm = TRUE)
################################################################################
####################### Only Reciprocal Pairs ##################################
################################################################################
# Remove rows that compare metabolites from the same dataset, them remove redundent measurements
melted_cormat2 <- melted_cormat
# Widen the datafrmae back to matrix
cormat2 <- melted_cormat2 %>%
pivot_wider(names_from = Var2, values_from = value) %>%
as.data.frame()
row.names(cormat2) <- cormat2$Var1
cormat2 <- cormat2 %>%
select(-Var1) %>%
as.matrix()
cormat2 <- cormat2[sort(rownames(cormat2)),sort(colnames(cormat2))]
cormat2 <- cormat2[grepl(reps[1], rownames(cormat2)),grepl(reps[2], colnames(cormat2))]
# Orient the colors and breaks
paletteLength <- 1000
myColor <- colorRampPalette(c("blue", "white", "red"))(paletteLength)
# length(breaks) == length(paletteLength) + 1
# use floor and ceiling to deal with even/odd length pallettelengths
myBreaks <- c(seq(-1, 0, length.out=ceiling(paletteLength/2) + 1),
seq(1/paletteLength, max(cor1, na.rm = T), length.out=floor(paletteLength/2)))
# Plot the heatmap
pheatmap(cormat2, color=myColor, breaks=myBreaks,
cluster_rows = F, cluster_cols = F,fontsize = 6)# Normalize
# Subset the data for sites to compare (scatterplots)
###################################################
norm_join_df <- countdata_df %>%
ungroup() %>%
unite(STUDY_INSTITUTE,METAB_FAMILY, col = "STUDY_INSTITUTE_METAB_FAMILY") %>%
filter(TISSUE == tissue) %>%
filter(NAMED == named) %>%
filter(STUDY_INSTITUTE_METAB_FAMILY == study_institute_metab_family) %>%
unnest(COUNT_DATA) %>%
filter(viallabel %in% as.character(unlist(sample_join$sample_id))) %>%
unite(labelid, viallabel, col = "labelid_viallabel", remove = F) %>%
mutate(REPLICATE = case_when(grepl(paste0(reps[1],"$"),labelid_viallabel) ~ reps[1],
grepl(paste0(reps[2],"$"),labelid_viallabel) ~ reps[2],
grepl(paste0(reps[3],"$"),labelid_viallabel) ~ reps[3])) %>%
split(.$REPLICATE) %>%
map(select, labelid_viallabel, METABOLITE_NAME, VALUE) %>%
map(pivot_wider, names_from = METABOLITE_NAME, values_from = VALUE) %>%
map(column_to_rownames, var = "labelid_viallabel") %>%
map(as.matrix) %>%
map(AutoScaleMatrix) %>%
map(as.data.frame) %>%
map(rownames_to_column, var = "labelid_viallabel") %>%
map(pivot_longer, names_to = 'METABOLITE_NAME', values_to = 'VALUE', cols = all_of(metabolites)) %>%
map(mutate, REPLICATE = case_when(grepl(paste0(reps[1],"$"),labelid_viallabel) ~ reps[1],
grepl(paste0(reps[2],"$"),labelid_viallabel) ~ reps[2],
grepl(paste0(reps[3],"$"),labelid_viallabel) ~ reps[3])) %>%
map(~rename(., !!sym(unique(.$REPLICATE)) := "VALUE")) %>%
map(~select(., -REPLICATE)) %>%
map(mutate, labelid = gsub(pattern = "_..*", replacement = '', labelid_viallabel)) %>%
map(select, -labelid_viallabel) %>%
purrr::reduce(left_join, by = c("labelid","METABOLITE_NAME"))
# Plot scatter plots ordered by sample (include R2 values)
lm_df <- norm_join_df %>%
rename(name = METABOLITE_NAME) %>%
rename(x = reps[1]) %>%
rename(y = reps[2])
# Iterate through each of the shared metabolites to collect summary info about their correlations
r2_df <- data.frame()
for(metab in metabolites){
met_df <- lm_df %>%
filter(name == metab)
r2_df <- rbind(r2_df, lm_eqn(met_df))
}
# Display summaries
met_r2_df <- r2_df %>%
arrange(METABOLITE) %>%
select(METABOLITE,R2) %>%
arrange(desc(R2))
met_r2_df %>%
knitr::kable(format = "html") %>%
scroll_box(width = "100%", height = "200px")| METABOLITE | R2 |
|---|---|
| 2-Aminobutyric acid | 0.883 |
| Alanine | 0.53 |
| Lysine | 0.496 |
| Sarcosine | 0.494 |
| Aspartic acid | 0.491 |
| beta-Alanine | 0.462 |
| Proline | 0.423 |
| Isoleucine | 0.406 |
| Threonine | 0.392 |
| Tyrosine | 0.377 |
| Serine | 0.347 |
| Valine | 0.336 |
| Glutamic acid | 0.299 |
| Glycine | 0.274 |
| Leucine | 0.252 |
| Tryptophan | 0.226 |
| Ethanolamine | 0.224 |
| Citrulline | 0.22 |
| Ornithine | 0.22 |
| Glutamine | 0.203 |
| Methionine | 0.188 |
| Asparagine | 0.169 |
| Arginine | 0.158 |
| Hydroxyproline | 0.157 |
| Aminoadipic acid | 0.116 |
| Carnosine | 0.0951 |
| Phenylalanine | 0.0491 |
| Cysteine | 0.0437 |
| 1-Methylhistidine | 0.0335 |
| gamma-Aminobutyric acid | 0.0332 |
| Taurine | 0.0329 |
| Cystathionine | 0.0281 |
| Histidine | 0.0105 |
| Anserine | 0.00496 |
| Homocysteine | 0.00439 |
| Phosphoethanolamine | 0.0038 |
| 3-Methylhistidine | 0.00311 |
# Collect the order of metabolites
scat_met_order <- met_r2_df %>% select(METABOLITE) %>% unlist() %>% as.character()
# Plot scatter plots (Normalized)
################################################################################
# Plot the scatter plots
for(metab in scat_met_order){
p <- norm_join_df %>%
filter(METABOLITE_NAME == metab) %>%
ggplot(aes(x = !!sym(reps[1]), y = !!sym(reps[2])), color = ) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", size = 1) +
geom_abline(linetype="dashed") +
facet_wrap(~ METABOLITE_NAME) +
expand_limits(x = 0, y = 0) +
#labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances")) +
coord_fixed(ratio=1)
plot(p)
}# norm_join_df %>%
# mutate(METABOLITE_NAME = factor(METABOLITE_NAME, levels = scat_met_order)) %>%
# ggplot(aes(x = !!sym(reps[1]), y = !!sym(reps[2])), color = ) +
# geom_point(alpha = 0.5) +
# geom_smooth(method = "lm", size = 1) +
# geom_abline(linetype="dashed") +
# facet_wrap(~ METABOLITE_NAME) +
# expand_limits(x = 0, y = 0) +
# labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances")) +
# coord_fixed(ratio=1)# Plot the density plot for all the gene counts
################################################################################
norm_df %>%
ggplot(aes(x = VALUE, color = REPLICATE)) +
geom_density() +
labs(title=paste0(tissue,' ',study_institute_metab_family,": Normalized Metabolite Abundances"),
x = "Abundance",
y = "Density")# Summary Statistics of Density plot distribution
################################################################################
# Iterate through the runs and collect vectors
df_all <- data.frame()
for(rep in reps){
# Collect numeric vector
num_vec <- norm_df %>%
filter(REPLICATE == rep) %>%
select(VALUE) %>% unlist() %>% unname()
df <- NumericSummaryStats(num_vec)
row.names(df) <- rep
df_all <- rbind(df_all, df)
}
df_all## NA_COUNT NA_FREQ MEAN MEDIAN MAX MIN MID_RANGE VARIANCE STD_DEV SE_MEAN Q1 Q3 KURTOSIS SKEW
## 513 0 0 0 -0.12 4.59 -2.61 7.20 0.99 0.99 0.02 -0.71 0.59 0.87 0.68
## 512 0 0 0 -0.15 6.41 -2.45 8.86 0.99 0.99 0.02 -0.67 0.57 1.42 0.82
## BC_LAMBDA
## 513 None
## 512 None
################################################################################
####################### Sample by Sample Correlations ##########################
################################################################################
# Subset Data
################################################################################
# Original NxP Matrix
x <- plot_df %>%
unite(labelid,REPLICATE,
col = labelid_REPLICATE, remove = F) %>%
select(labelid_REPLICATE, METABOLITE_NAME, VALUE,REPLICATE) %>%
split(.$REPLICATE) %>%
map(select, -REPLICATE) %>%
map(pivot_wider, names_from = METABOLITE_NAME, values_from = VALUE) %>%
map(tibble::column_to_rownames, var = "labelid_REPLICATE") %>%
map(as.matrix) %>%
map(AutoScaleMatrix)
# Subset matrices
shared_sam_mat <- do.call(rbind,x) %>%
t()
# Create an NxN Matrix of correlation
Corr <- 'Spearman'
if(Corr == 'Pearson'){
cor1 <- stats::cor(shared_sam_mat, method = 'pearson') %>% round(digits = 3)
}else if(Corr == 'Spearman'){
cor1 <- stats::cor(shared_sam_mat, method = 'spearman') %>% round(digits = 3)
}
# Melt the correlation matrix
melted_cormat <- melt(cor1, na.rm = TRUE)
################################################################################
####################### Unclustered ############################################
################################################################################
# Remove rows that compare metabolites from the same dataset, them remove redundent measurements
melted_cormat2 <- melted_cormat
# Widen the dataframwe back to matrix
cormat2 <- melted_cormat2 %>%
pivot_wider(names_from = Var2, values_from = value) %>%
as.data.frame()
row.names(cormat2) <- cormat2$Var1
cormat2 <- cormat2 %>%
select(-Var1) %>%
as.matrix()
cormat2 <- cormat2[sort(rownames(cormat2)),sort(colnames(cormat2))]
cormat2 <- cormat2[grepl(reps[1], rownames(cormat2)),grepl(reps[2], colnames(cormat2))]
# Orient the colors and breaks
paletteLength <- 1000
myColor <- colorRampPalette(c("blue", "white", "red"))(paletteLength)
# length(breaks) == length(paletteLength) + 1
# use floor and ceiling to deal with even/odd length pallettelengths
myBreaks <- c(seq(-1, 0, length.out=ceiling(paletteLength/2) + 1),
seq(1/paletteLength, max(cor1, na.rm = TRUE), length.out=floor(paletteLength/2)))
# Plot the heatmap
pheatmap(as.matrix(cormat2), color=myColor, breaks=myBreaks,
cluster_rows = F, cluster_cols = F,fontsize = 6)# Plot a PCA with outliers labeled
# shared_sam_mat <- shared_sam_mat %>%
# t() %>% AutoScaleMatrix() %>%
# t()
pca <- prcomp(na.omit(t(shared_sam_mat)), scale. = F)
percentVar <- pca$sdev^2/sum(pca$sdev^2)
PC <- pca$x %>% as.data.frame()
PC$labelid_rep <- as.character(row.names(pca$x))
PC <- PC %>%
tidyr::separate(col = labelid_rep, into = c('labelid', 'REPLICATE'),
sep = "_", remove = T)
# Join the phenotype data
PC <- left_join(PC, pheno_df, by = 'labelid')
p0 <- PC %>%
ggplot(aes(x = PC1, y = PC2, color=REPLICATE)) +
geom_point(size = 3) +
geom_line(aes(group = labelid),color="dark grey") +
ggtitle('Shared Samples and Shared Metabolites') +
xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
theme(aspect.ratio=1)
p0p1 <- PC %>%
ggplot(aes(x = PC1, y = PC2, color=Key.anirandgroup, shape = REPLICATE)) +
geom_point(size = 3) +
geom_line(aes(group = labelid),color="dark grey") +
ggtitle('Shared Samples and Shared Metabolites') +
xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
scale_color_manual(values=ec_colors) +
theme(aspect.ratio=1)
p1PC$Registration.sex <- as.character(PC$Registration.sex)
p2 <- PC %>%
ggplot(aes(x = PC1, y = PC2, color=Registration.sex, shape = REPLICATE)) +
geom_point(size = 3) +
geom_line(aes(group = labelid),color="dark grey") +
ggtitle('Shared Samples and Shared Metabolites') +
xlab(paste0('PC1: ',round(percentVar[1]*100,2),'% variance')) +
ylab(paste0('PC2: ',round(percentVar[2]*100,2),'% variance')) +
theme(aspect.ratio=1)
p2